/***
This do-file produces the Median Household Income ZIP-level data we use from the 
CSV downloaded from Census.
***/

*-------------------------------------------------------------------------------
* Set up
*-------------------------------------------------------------------------------

* Set $root
project figstabs, root
if (r(buildrunning)==0) include "${root}/code/config_interactive.do"

* Create required folders
cap mkdir "${root}/data/derived/ACS 2014-2018 5-Year ZCTA"
cap mkdir "${root}/data/derived/ACS 2014-2018 5-Year ZCTA/Individual Variables"

*-------------------------------------------------------------------------------
* Load and clean raw data
*-------------------------------------------------------------------------------

cd "${root}/data/derived/ACS 2014-2018 5-Year ZCTA/Individual Variables"

project, uses("${root}/data/dvc/ACS 2014-2018 5-Year ZCTA/Median Household Income/ACSST5Y2018.S1903_2020-08-21T102635.zip") raw
unzipfile "${root}/data/dvc/ACS 2014-2018 5-Year ZCTA/Median Household Income/ACSST5Y2018.S1903_2020-08-21T102635.zip", replace

project, uses("${root}/data/derived/ACS 2014-2018 5-Year ZCTA/Individual Variables/ACSST5Y2018.S1903_data_with_overlays_2020-08-21T094359.csv")
import delimited "${root}/data/derived/ACS 2014-2018 5-Year ZCTA/Individual Variables/ACSST5Y2018.S1903_data_with_overlays_2020-08-21T094359.csv", varnames(1) clear

* Remove sections of these variables other than the 5-digit ZCTA
replace geo_id = subinstr(geo_id, "8600000US", "", .)
replace name = subinstr(name, "ZCTA5 ", "", .)

* Drop row containing descriptions of variables
drop in 1

* Variables are now identical so one can be dropped and the remaining one renamed to "zcta5"
assert geo_id == name
drop geo_id
rename name zcta5
assert strlen(zcta5) == 5

* Keeping only the ZCTA variable and the median household income variable
keep zcta5 s1903_c03_001e
rename s1903_c03_001e medhhinc_2014_2018_est

* Replace non-numeric missing values with an empty string and non-numeric approximate values with the nearest 
replace medhhinc_2014_2018_est = "" if medhhinc_2014_2018_est == "-"
replace medhhinc_2014_2018_est = "" if medhhinc_2014_2018_est == "null"
replace medhhinc_2014_2018_est = "2499" if medhhinc_2014_2018_est == "2,500-"
replace medhhinc_2014_2018_est = "250001" if medhhinc_2014_2018_est == "250,000+"

* Convert all variables to numeric
destring zcta5, replace
destring medhhinc_2014_2018_est, replace

*-------------------------------------------------------------------------------
* Save file
*-------------------------------------------------------------------------------

save "${root}/data/derived/ACS 2014-2018 5-Year ZCTA/Individual Variables/ACS 2014-2018 Median Household Income.dta", replace
project, creates("${root}/data/derived/ACS 2014-2018 5-Year ZCTA/Individual Variables/ACS 2014-2018 Median Household Income.dta")
